This is a not-fully-edited version of our analysis of the Washington Post's fatal police shooting dataset. We used data from mid-April, so this analysis does not account for more recent events. Not everything is in perfect order yet, but that will eventually be fixed. We mostly created this markdown file because the code from our separate parts of the analysis is fairly incomprehensible in its current form and takes some editing to run.

Laura

# Load the Washington Post fatal police shootings CSV (mid-April snapshot).
df.fatal <- read.csv("data-police-shootings-master/fatal-police-shootings-data.csv")
# Quick look at the first rows / column layout.
head(df.fatal)
##   id               name       date  manner_of_death      armed age gender
## 1  3         Tim Elliot 2015-01-02             shot        gun  53      M
## 2  4   Lewis Lee Lembke 2015-01-02             shot        gun  47      M
## 3  5 John Paul Quintero 2015-01-03 shot and Tasered    unarmed  23      M
## 4  8    Matthew Hoffman 2015-01-04             shot toy weapon  32      M
## 5  9  Michael Rodriguez 2015-01-04             shot   nail gun  39      M
## 6 11  Kenneth Joe Brown 2015-01-04             shot        gun  18      M
##   race          city state signs_of_mental_illness threat_level
## 1    A       Shelton    WA                    True       attack
## 2    W         Aloha    OR                   False       attack
## 3    H       Wichita    KS                   False        other
## 4    W San Francisco    CA                    True       attack
## 5    H         Evans    CO                   False       attack
## 6    W       Guthrie    OK                   False       attack
##          flee body_camera
## 1 Not fleeing       False
## 2 Not fleeing       False
## 3 Not fleeing       False
## 4 Not fleeing       False
## 5 Not fleeing       False
## 6 Not fleeing       False

Plot Maps

Get lonlat data and frequency for map

# Requires tidyr (unite) and ggmap (geocode).
df.fatal <- unite(data=df.fatal, city.state, c(city, state), sep = " ", remove = FALSE) #create variable city.state for better accuracy
df.loc <- as.data.frame(table(df.fatal$city.state)) #get freq of incidents per city
names(df.loc)[1] <- 'city.state'
# NOTE(review): source = 'dsk' is the Data Science Toolkit geocoder, which has
# since been shut down -- rerunning will need a different geocoding source.
lonlat <- geocode(as.character(df.loc$city.state), source = 'dsk') #get latitude and longitude
df.loc <- na.omit(cbind(df.loc, lonlat)) #remove NA (cities the geocoder failed on)
#saveRDS(df.loc, "~/processedData/df.loc.RDS") #save df.loc if use google maps since it takes so long

Plot using white US map without Alaska or Hawaii

US <- map_data("state") #get US map data, white map
# coord_cartesian() zooms without discarding rows. The previous xlim()/ylim()
# scale limits silently dropped data before plotting (e.g. points south of
# lat 25 such as the Florida Keys and Hawaii), with a removed-rows warning.
ggplot(data=US, aes(x=long, y=lat, group=group)) +
  geom_polygon(fill="white", colour="black") +
  geom_point(data=df.loc, inherit.aes=F, aes(x=lon, y=lat, size=Freq), colour="blue",  alpha=.8) +
  coord_cartesian(xlim = c(-130, -50), ylim=c(20,55))

Plot using a Google map. This requires an old version of ggplot2 (install via devtools::install_github("hadley/ggplot2@v2.2.0")) to work with Google maps.

# Reload the previously geocoded city frequencies saved above.
df.loc <- readRDS("processedData/df.loc.RDS") #to load
df.loc$city.state <- as.character(df.loc$city.state)

Separate noncontiguous states

# Factors interfere with string matching, so work on plain character values.
df.loc$city.state <- as.character(df.loc$city.state)
# Rows whose "City ST" label ends in the given state abbreviation.
state_rows <- function(code) grepl(paste0(code, "$"), df.loc$city.state)
hawaii <- df.loc[state_rows("HI"), ]
alaska <- df.loc[state_rows("AK"), ]

US contiguous

# Google basemap centered near the geographic center of the contiguous U.S.
map <- get_map(location=c(lon = -96.35, lat = 39.70), zoom = 4, source="google",crop=TRUE)
# Point size encodes the incident count per city; axes hidden for a clean map.
ggmap(map, legend = "none") +
  geom_point(aes(x = lon, y = lat, size=Freq), data = df.loc, alpha = .7, color = "darkblue") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank())

Alaska

# Basemap for Alaska; plot only the AK subset of cities.
map <- get_map(location = "alaska", zoom = 4)
ggmap(map, legend = "none") +
  geom_point(aes(x = lon, y = lat, size=Freq), data = alaska, alpha = .7, color = "darkblue") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank())

Hawaii

# Basemap for Hawaii; plot only the HI subset of cities.
map <- get_map(location = "hawaii", zoom = 7)
ggmap(map, legend = "none") +
  geom_point(aes(x = lon, y = lat, size=Freq), data = hawaii, alpha = .7, color = "darkblue") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank())

Clustering

Create df with lat and lon

# Geocode every record's city.state (one lon/lat pair per incident).
lonlat <- geocode(as.character(df.fatal$city.state), source = 'dsk')
# Bug fix: the original bound 'latlon', which is never defined -- the geocode
# result above is named 'lonlat'.
df.location <- cbind(df.fatal, lonlat)
#saveRDS(df.location, "processedData/df.location.RDS")

Use this data for df.fatal

# Use the pre-geocoded data set saved above.
df.fatal <- readRDS("processedData/df.location.RDS")
df.na <- df.fatal[rowSums(is.na(df.fatal)) > 0,] #see rows with missing values
df.fatal <- na.omit(df.fatal) #few NA, so just won't use them

Create new variable minority

# Collapse race codes into four analysis groups: B -> black, H -> hispanic,
# W -> white, everything else (A, N, O, blank, ...) -> other.
df.fatal$minority <- factor(
  ifelse(df.fatal$race == 'B', 'black',
  ifelse(df.fatal$race == 'H', 'hispanic',
  ifelse(df.fatal$race == 'W', 'white', 'other'))))

Create distance matrix for visualization, use Gower distance since mostly categorical data

# Remove identifier/geography columns before computing distances; id stays in
# column 1 for traceability but is excluded from the distance itself below.
drop <- c('name', 'date', 'city.state', 'city', 'state', 'race')
df.fatal.clean <- df.fatal[ , !(names(df.fatal) %in% drop)]
# Gower distance (cluster::daisy) handles the mixed numeric/categorical columns.
gower_dist <- daisy(df.fatal.clean[, -1], metric = "gower")

Check Gower dist works

gower_mat <- as.matrix(gower_dist)

Output most similar pair

# Smallest nonzero distance = most similar pair of records; the inner
# != min filter excludes the zero self-distances on the diagonal.
df.fatal.clean[which(gower_mat == min(gower_mat[gower_mat != min(gower_mat)]),
        arr.ind = TRUE)[1, ], ]
##        id manner_of_death armed age gender signs_of_mental_illness
## 2015 2238            shot   gun  33      M                   False
## 1914 2134            shot   gun  33      M                   False
##      threat_level        flee body_camera       lon      lat minority
## 2015        other Not fleeing       False -118.2612 33.81332 hispanic
## 1914        other Not fleeing       False -118.2484 33.97395 hispanic

Output most dissimilar pair

# Most dissimilar pair: the records at the largest pairwise Gower distance.
# (The earlier version mirrored the min construction and filtered out the
# maximum itself, so it actually returned the SECOND most dissimilar pair.)
df.fatal.clean[which(gower_mat == max(gower_mat), arr.ind = TRUE)[1, ], ]
##        id  manner_of_death armed age gender signs_of_mental_illness
## 2124 2372             shot knife  30      F                    True
## 1637 1829 shot and Tasered   gun  28      M                   False
##      threat_level flee body_camera        lon      lat minority
## 2124        other  Car       False  -97.77126 30.32637    black
## 1637       attack Foot        True -149.33601 67.09454    other

Calculate silhouette width for many k using PAM

# Preallocate the result vector instead of growing it inside the loop.
# Index 1 stays NA because silhouette width is undefined for k = 1.
sil_width <- rep(NA_real_, 10)

for(i in 2:10){
  pam_fit <- pam(gower_dist,
                 diss = TRUE,
                 k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}

Plot silhouette width (higher is better)

# Average silhouette width per k; higher means better-separated clusters.
plot(1:10, sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")

Looks like K = 2 is best

# Final PAM fit at k = 2, then a per-cluster summary() of every variable.
pam_fit <- pam(gower_dist, diss = TRUE, k = 2)
pam_results <- df.fatal.clean %>% dplyr::select(-id) %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))

Look at numerics of clusters

pam_results$the_summary
## [[1]]
##          manner_of_death          armed           age       gender  
##  shot            :1440   gun         :1122   Min.   :14.0    :   0  
##  shot and Tasered:  70   toy weapon  :  73   1st Qu.:27.0   F:  55  
##                          unarmed     :  72   Median :34.0   M:1455  
##                          vehicle     :  63   Mean   :36.9           
##                          knife       :  53   3rd Qu.:45.0           
##                          undetermined:  53   Max.   :91.0           
##                          (Other)     :  74                          
##  signs_of_mental_illness       threat_level           flee     
##  False:1171              attack      :1352              :  26  
##  True : 339              other       :  74   Car        : 217  
##                          undetermined:  84   Foot       : 192  
##                                              Not fleeing:1009  
##                                              Other      :  66  
##                                                                
##                                                                
##  body_camera       lon               lat            minority      cluster 
##  False:1367   Min.   :-168.02   Min.   :19.56   black   :384   Min.   :1  
##  True : 143   1st Qu.:-112.07   1st Qu.:33.52   hispanic:265   1st Qu.:1  
##               Median : -95.31   Median :36.07   other   :108   Median :1  
##               Mean   : -97.86   Mean   :36.72   white   :753   Mean   :1  
##               3rd Qu.: -83.92   3rd Qu.:39.91                  3rd Qu.:1  
##               Max.   : -68.10   Max.   :71.29                  Max.   :1  
##                                                                           
## 
## [[2]]
##          manner_of_death          armed          age        gender 
##  shot            :586    knife       :272   Min.   : 6.00    :  0  
##  shot and Tasered: 84    unarmed     : 88   1st Qu.:26.00   F: 37  
##                          vehicle     : 83   Median :34.00   M:633  
##                          gun         : 67   Mean   :35.64          
##                          undetermined: 44   3rd Qu.:43.00          
##                          toy weapon  : 22   Max.   :83.00          
##                          (Other)     : 94                          
##  signs_of_mental_illness       threat_level          flee     body_camera
##  False:469               attack      : 51              : 19   False:575  
##  True :201               other       :574   Car        :110   True : 95  
##                          undetermined: 45   Foot       : 65              
##                                             Not fleeing:460              
##                                             Other      : 16              
##                                                                          
##                                                                          
##       lon               lat            minority      cluster 
##  Min.   :-157.93   Min.   :19.56   black   :169   Min.   :2  
##  1st Qu.:-111.92   1st Qu.:33.53   hispanic:112   1st Qu.:2  
##  Median : -88.10   Median :36.84   other   : 63   Median :2  
##  Mean   : -95.00   Mean   :36.74   white   :326   Mean   :2  
##  3rd Qu.: -81.47   3rd Qu.:40.10                  3rd Qu.:2  
##  Max.   : -68.01   Max.   :61.58                  Max.   :2  
## 

Looks to be clustered by threat level, race, and armed status. Look at the medoids.

df.fatal.clean[pam_fit$medoids, ]
##        id manner_of_death armed age gender signs_of_mental_illness
## 395   498            shot   gun  35      M                   False
## 2100 2344            shot knife  33      M                   False
##      threat_level        flee body_camera       lon      lat minority
## 395        attack Not fleeing       False -95.97522 35.62283    white
## 2100        other Not fleeing       False -89.57461 37.33529    white

Dimension reduction

tSNE, t-distributed stochastic neighborhood embedding

# 2-D t-SNE embedding (Rtsne) of the precomputed Gower distances.
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y"))

Lets look at some variables in 2D to see if anything separates well

# Attach cluster labels and the original variables to the embedding so each
# plot below colors the same 2-D layout by a different variable.
tsne_data <-  data.frame(cluster = factor(pam_fit$clustering), df.fatal.clean, tsne_data, race=df.fatal$race)
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=threat_level))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=manner_of_death))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=signs_of_mental_illness))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=body_camera))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=minority))

ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=race))

We can compare this with classical MDS.

# Classical (metric) MDS on the same Gower distances.
fit <- cmdscale(gower_dist, k=2) #k is the number of dimensions
#fit #view results

Plot MDS without coloring groups

fit <- as.data.frame(fit) # cmdscale returns a matrix; columns become V1, V2
# Map the named columns instead of indexing the data frame inside aes(),
# which bypasses ggplot's data masking and is fragile.
ggplot() + geom_point(data=fit, aes(x = V1, y = V2))

Plot MDS, color by cluster

# Join the MDS coordinates (V1, V2) back onto the data; color by PAM cluster.
fit <- cbind(fit, df.fatal, cluster = factor(pam_fit$clustering))
ggplot() + geom_point(data=fit, aes(x=V1, y=V2, color=cluster))

Zoe

fat.dat <- readRDS("processedData/df.location.RDS")

eliminate the rows with missing entries

# Keep only fully observed rows; complete.cases() is the idiomatic equivalent
# of rowSums(is.na(x)) == 0.
miss.dat <- fat.dat[complete.cases(fat.dat), ]
# Drop two known-problem records in one pass; %in% also stays correct if
# either id is already absent from the data.
miss.dat <- miss.dat[!(miss.dat$id %in% c(2158, 2304)), ]

discretize weapons

weapon_converter <- function(x){
  # Collapse a raw 'armed' description into one of seven broad categories.
  # Returns a length-1 factor, matching the original implementation.
  gun_labels   <- c('gun', 'guns and explosives', 'gun and knife',
                    'hatchet and gun', 'machete and gun')
  knife_labels <- c('knife', 'pole and knife', 'sword', 'machete')
  category <-
    if (x %in% gun_labels) 'gun'
    else if (x %in% knife_labels) 'knife'
    else if (x %in% c('vehicle', 'motorcycle')) 'vehicle'
    else if (x %in% c('', 'undetermined')) 'undetermined'
    else if (x == 'toy weapon') 'toy weapon'
    else if (x == 'unarmed') 'unarmed'
    else 'other'
  as.factor(category)
}
# Collapsed weapon category per record (sapply over length-1 factors).
weap.dat <- sapply(miss.dat$armed, weapon_converter)
# Drop columns 1:3 (id, name, date) and 9:11 -- presumably the city/state/
# location identifiers; TODO confirm positions against the saved RDS.
use.dat <- miss.dat[,-c(1:3,9:11)]
# Overwrite the raw 'armed' column (assumed to be column 2 after the drops
# above -- verify) with the collapsed weapon categories.
use.dat[,2] <- weap.dat

CLASSICAL MDS – GET DISTANCES BETWEEN PEOPLE

Gower distance Euclidean distance doesn’t make sense for categorical data

gower.dist <- daisy(use.dat, metric = "gower")

try three dimensions

# 3-D classical MDS, rendered with plot3D::scatter3D, colored by race code.
gower.mds3 <- cmdscale(gower.dist, k = 3, eig = TRUE)
gmds.res3 <- data.frame(gower.mds3$points)
scatter3D(x = gmds.res3$X1, y = gmds.res3$X2, z = gmds.res3$X3, colvar = as.numeric(use.dat$race),
          axis.ticks = T, ticktype = "detailed",pch = 19, cex = 0.5, bty = "g", theta = 85, phi = 15)

try two dimensions

# 2-D classical MDS; the embedded points come out as columns X1 and X2.
gower.mds <- cmdscale(gower.dist, k = 2, eig = TRUE)
gmds.res <- data.frame(gower.mds$points)

plain MDS plot

ggplot(data = gmds.res, aes(x = X1, y = X2)) + geom_point(cex = 0.5) + labs(title = "Classical MDS")

colored by mental illness

# NOTE(review): coloring by use.dat$... inside aes() only works because
# use.dat and gmds.res share row order -- fragile; consider cbind-ing first.
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
  geom_point(aes(color = use.dat$signs_of_mental_illness), cex = 0.5) +
  labs(title = "Classical MDS: Mental Illness", color = "Mental Illness") +
  guides(colour = guide_legend(override.aes = list(size=2)))

colored by threat level

# Same embedding, colored by threat level; relabel legend entries.
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
  geom_point(aes(color = use.dat$threat_level), cex = 0.5) +
  scale_color_hue(labels = c("Attack", "Other","Undet.")) +
  labs(title = "Classical MDS: Threat Level", color = "Threat Level") +
  guides(colour = guide_legend(override.aes = list(size=2)))

colored by race

# Same embedding, colored by race code ("", A, B, H, N, O, W relabelled).
ggplot(data = gmds.res, aes(x = X1, y = X2)) + geom_point(aes(color = use.dat$race), cex = 0.5) +
  labs(title = "Classical MDS: Race", color = "Race") +
  scale_color_hue(labels = c("Undet.", "Asian","Black", "Hispanic","Nat. Am.", "Other race","White")) +
  guides(colour = guide_legend(override.aes = list(size=2)))

colored by body camera

# Same embedding, colored by whether a body camera was present.
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
  geom_point(aes(color = use.dat$body_camera), cex = 0.5) +
  labs(title = "Classical MDS: Body Camera", color = "Body Camera") +
  guides(colour = guide_legend(override.aes = list(size=2)))

PLOTS INVOLVING RACE

RACE IN FATAL SHOOTINGS

# Share of fatal shootings by race, as a percent of all records.
race.counts <- table(use.dat$race)
race.prop <- prop.table(race.counts)*100
# Labels assume the table's level order is "", A, B, H, N, O, W (matches the
# legend labels used elsewhere in this file -- confirm before rerunning).
rownames(race.prop) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
race.prop <- data.frame(race.prop)
ggplot(data = race.prop, aes(x = Var1, y = Freq)) + geom_bar(aes(fill = Var1), stat = "identity") +
  labs(title = "Racial Breakdown: Fatal Police Shootings", y = "Percent", x = "") +
  theme(axis.text=element_text(size=12)) + guides(fill = FALSE) + ylim(c(0,100))

RACE IN THE US

# U.S. population percentages, ordered to match the race labels above
# (Undet., Asian, Black, Hispanic, Nat. Am., Other race, White).
us.race <- data.frame(race = race.prop[,1],
                      pct  = c(0, 4.7, 12.2, 16.3, 0.9, 2.1, 63.7))
# Map named columns rather than evaluating global objects inside aes(),
# which the original relied on (fragile, and depends on auto-mangled names).
ggplot(data = us.race, aes(x = race, y = pct)) +
  geom_bar(aes(fill = race), stat = "identity") +
  labs(title = "Racial Breakdown: U.S. Population", y = "Percent", x = "") +
  theme(axis.text=element_text(size=12)) + guides(fill = FALSE) + ylim(c(0,100))

RACE AND WEAPON

# Racial composition within each weapon category (columns sum to 100%).
rw.tab <- table(use.dat$race, use.dat$armed)
rownames(rw.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
rw.dat <- data.frame(prop.table(rw.tab,2)*100)
colnames(rw.dat)[1] <- "Race"
ggplot(data = rw.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
  geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

RACE AND THREAT LEVEL

# Racial composition within each threat level (columns sum to 100%).
ra.tab <- table(use.dat$race, use.dat$threat_level)
rownames(ra.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
ra.dat <- data.frame(prop.table(ra.tab,2)*100)
colnames(ra.dat)[1] <- "Race"
colnames(ra.dat)[2] <- "Threat"
ggplot(data = ra.dat, aes(x = Race, y = Freq)) + facet_wrap(~Threat) +
  geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

# Flip the conditioning: threat-level distribution within each race
# (rows of the table now sum to 100%).
ra.dat2 <- data.frame(prop.table(ra.tab,1)*100)
colnames(ra.dat2)[1] <- "Race"
colnames(ra.dat2)[2] <- "Threat"
ggplot(data = ra.dat2, aes(x = Threat, y = Freq)) + facet_wrap(~Race) +
  geom_bar(aes(fill = Threat), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

RACE AND MENTAL ILLNESS

# Racial composition within each mental-illness category (columns sum to 100%).
rmi.tab <- table(use.dat$race, use.dat$signs_of_mental_illness)
rownames(rmi.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rmi.tab) <- factor(c("No Signs of Mental Illness", "Signs of Mental Illness"))
rmi.dat <- data.frame(prop.table(rmi.tab,2)*100)
colnames(rmi.dat)[1] <- "Race"
ggplot(data = rmi.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
  geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

RACE AND BODY CAMERA

# Racial composition within each body-camera category (columns sum to 100%).
rbc.tab <- table(use.dat$race, use.dat$body_camera)
rownames(rbc.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rbc.tab) <- factor(c("No Body Camera", "Body Camera"))
rbc.dat <- data.frame(prop.table(rbc.tab,2)*100)
colnames(rbc.dat)[1] <- "Race"
ggplot(data = rbc.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
  geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

RACE AND FLEEING

# Racial composition within each fleeing category (columns sum to 100%).
rf.tab <- table(use.dat$race, use.dat$flee)
rownames(rf.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rf.tab)[1] <- "Undetermined"
rf.dat <- data.frame(prop.table(rf.tab,2)*100)
colnames(rf.dat)[1] <- "Race"
ggplot(data = rf.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
  geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  theme(strip.text = element_text(size = 11))

## Kidus

binary_decision <- function(vec){
  # Threshold a numeric vector at 0.5: values below 0.5 become 0,
  # values at or above 0.5 become 1.
  ifelse(vec >= 0.5, 1, 0)
}

load in shared data with Laura, Zoe and Ed

master_data = readRDS("processedData/df.location.RDS")

Clean up the data to remove duplicate entries.

# Keep only fully observed rows.
master_data = master_data[complete.cases(master_data),]
# Drop two known-bad records. Use %in% rather than -which(): when an id is
# absent, which() returns integer(0) and x[-integer(0), ] silently drops
# EVERY row of the data frame.
master_data = master_data[!(master_data$id %in% c(2304, 2158)),]

There were many different kinds of weapons that were only mentioned once, and others that were combined categories, so we chose to simplify. We created new categories “gun”, “knife”, “vehicle”, “undetermined”, “toy weapon”, “unarmed”, and “other”

weapon_converter <- function(x){
  # Collapse a detailed 'armed' description into one of seven broad
  # categories; returns a length-1 factor as in the original.
  groups <- list(
    gun          = c('gun', 'guns and explosives', 'gun and knife',
                     'hatchet and gun', 'machete and gun'),
    knife        = c('knife', 'pole and knife', 'sword', 'machete'),
    vehicle      = c('vehicle', 'motorcycle'),
    undetermined = c('', 'undetermined'),
    `toy weapon` = 'toy weapon',
    unarmed      = 'unarmed'
  )
  for (cat in names(groups)) {
    if (x %in% groups[[cat]]) return(as.factor(cat))
  }
  as.factor('other')  # anything not listed above
}

# add weapon category column
# NOTE(review): each call returns a length-1 factor; sapply/unlist recombines
# these -- verify the resulting column is a factor, not integer level codes.
weapon_cat = sapply(master_data$armed, function(x) weapon_converter(x))

master_data = cbind(master_data, weapon_cat)
race_converter <- function(x){
  # Collapse race codes: keep 'B' and 'W', lump every other code into 'O'.
  code <- if (x == 'B') 'B' else if (x == 'W') 'W' else 'O'
  as.factor(code)
}

# add race category column (B / W / O) and bind it onto the data
race_cat = sapply(master_data$race, function(x) race_converter(x))
master_data = cbind(master_data, race_cat)

We were interested in any regional effects, so we subdivided the states into 'Northeast', 'Midwest', 'South', and 'West'.

region_converter <- function(x){
  # Map a two-letter state/DC code to a census-style region factor:
  # 1 = Northeast, 2 = Midwest, 3 = South, 4 = West.
  # Unknown codes fall through and return NULL (the original's behavior);
  # %in% against named vectors replaces the long chains of '=='.
  northeast <- c('CT','ME','MA','NH','RI','VT','NJ','NY','PA')
  midwest   <- c('IL','IN','MI','OH','WI','IA','KS','MN','MO','NE','SD','ND')
  south     <- c('DE','FL','GA','MD','NC','SC','VA','DC','WV','AL','KY','MS',
                 'TN','AR','LA','OK','TX')
  west      <- c('AZ','CO','ID','MT','NV','NM','UT','WY','AK','CA','HI','OR','WA')
  if (x %in% northeast) return(as.factor(1))
  if (x %in% midwest) return(as.factor(2))
  if (x %in% south) return(as.factor(3))
  if (x %in% west) return(as.factor(4))
  NULL
}
# Human-readable labels for region codes 1-4, in order.
region_defn = c('Northeast','Midwest','South','West')
region = sapply(master_data$state, function(x) region_converter(as.character(x)))

master_data = cbind(master_data, region)

#subset the data 
# Five candidate feature sets for the classifiers below; each drops the
# identifier columns plus a different mix of predictors, always keeping the
# response race_cat.
drops1 = c("state","city","city.state","name","id","region","race", "date","armed")
fatal_data1 = master_data[,!(names(master_data) %in% drops1)]

drops2 = c("state","city","city.state","name","id","race","region","date","manner_of_death","armed","gender","flee","body_camera","weapon_cat")
fatal_data2 = master_data[,!(names(master_data) %in% drops2)]

drops3 = c("state","city","city.state","name","id","lat","lon","race", "date","armed")
fatal_data3 = master_data[,!(names(master_data) %in% drops3)]

drops4 = c("state","city","city.state","name","id","race", "date","armed")
fatal_data4 = master_data[,!(names(master_data) %in% drops4)]

drops5 = c("state","city","city.state","name","id","race","region","date","manner_of_death","armed","flee","body_camera")
fatal_data5 = master_data[,!(names(master_data) %in% drops5)]

Split up our data into train and test set

smp_size <- floor(0.75 * nrow(fatal_data1)) #75% of the dataset

set the seed to make the partition reproducible

set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data1)), size = smp_size)
train <- fatal_data1[train_ind, ]
test <- fatal_data1[-train_ind, ]
# race_cat is the response; everything else is a predictor.
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]

Train Gaussian SVM on full training set.

# RBF-kernel SVM (e1071); cost/gamma appear hand-tuned -- no CV shown here.
cost = 95
gamma = 0.01
g.svm <- svm(race_cat ~., data = train,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 205  91  52
##                W 197 653 205
##                B  25  59 146
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6148194

0.6148194 Test Gaussian SVM on full test set

# Held-out confusion matrix.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred   O   W   B
##          O  62  31  18
##          W  51 218  79
##          B   8  27  51
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6073394

0.6073394 POLY SVM Polynomial kernel Train Poly SVM on full training set

# Polynomial-kernel SVM on the same training data.
cost = 10
gamma = 0.01
degree = 3
coef0 = 2
g.svm <- svm(race_cat ~., data = train,
             type = 'C-classification', kernel = 'polynomial',
             cost = cost, gamma = gamma, coef0 = coef0, degree=degree)

# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 206  95  53
##                W 198 648 212
##                B  23  60 138
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6074709

0.6074709 Test Poly SVM on full test set

# Held-out confusion matrix.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred   O   W   B
##          O  59  30  16
##          W  55 215  82
##          B   7  31  50
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5944954

0.5944954 Linear kernel Train Gaussian SVM on full training set

# Linear-kernel SVM (the surrounding header says "Gaussian" -- it is linear).
cost = 100
g.svm <- svm(race_cat ~., data = train,
             type = 'C-classification', kernel = 'linear',
             cost = cost)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 204 112  52
##                W 195 601 204
##                B  28  90 147
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.5829761

0.5829761 Test Gaussian SVM on full test set

# Held-out confusion matrix.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred   O   W   B
##          O  61  32  18
##          W  51 210  77
##          B   9  34  53
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5944954

0.5944954 Train Multinomial Logistic Regression on full training set

m.log.r <- multinom(race_cat~., data=train)
## # weights:  66 (42 variable)
## initial  value 1794.033867 
## iter  10 value 1580.801489
## iter  20 value 1480.560485
## iter  30 value 1477.198958
## iter  40 value 1476.880296
## final  value 1476.879822 
## converged
# Training-set confusion matrix.
m.log.r.pred.train <- predict(m.log.r,newdata = X.train)
table(m.log.r.pred.train, Y.train)
##                   Y.train
## m.log.r.pred.train   O   W   B
##                  O 184 103  44
##                  W 202 594 201
##                  B  41 106 158
print(sum(m.log.r.pred.train == Y.train)/length(Y.train))
## [1] 0.5731782

0.5731782 Test Multinomial Logistic Regression on full test set

# Held-out confusion matrix.
m.log.r.pred <- predict(m.log.r, newdata = X.test)
table(m.log.r.pred, Y.test)
##             Y.test
## m.log.r.pred   O   W   B
##            O  53  26  13
##            W  56 213  77
##            B  12  37  58
print(sum(m.log.r.pred == Y.test)/length(Y.test))
## [1] 0.5944954

0.5944954

BY REGION

REGION 1 - NORTHEAST

reg.data <- fatal_data4[fatal_data4$region == 1,]

split up our data into train and test set

# Seeded 75/25 split within this region; race_cat is the response.
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# RBF SVM within this region; region is excluded from the formula since it is
# constant inside the subset.
cost = 250
gamma = 0.01
g.svm <- svm(race_cat ~. -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train  O  W  B
##                O  5  1  0
##                W  7 48  8
##                B  9  9 36
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7235772

0.7235772 Test Gaussian SVM on region

# Held-out confusion matrix for this region.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O  0  2  0
##          W  3 12  5
##          B  4  4 11
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5609756

0.5609756

REGION 2 - MIDWEST

reg.data <- fatal_data4[fatal_data4$region == 2,]

split up our data into train and test set

# Seeded 75/25 split within this region; race_cat is the response.
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# RBF SVM within this region; region excluded (constant inside the subset).
cost = 220
gamma = 0.1
g.svm <- svm(race_cat ~. -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O  26   0   0
##                W   4 152   5
##                B   2   5  77
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.9409594

0.9409594 Test Gaussian SVM on region

# Held-out confusion matrix for this region.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O  0  4  3
##          W  4 35 15
##          B  1  7 22
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6263736

0.6263736 #### REGION 3 - SOUTH

reg.data <- fatal_data4[fatal_data4$region == 3,]

split up our data into train and test set

# Seeded 75/25 split within this region; race_cat is the response.
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# RBF SVM within this region; region excluded (constant inside the subset).
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~. -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O  66   3   4
##                W  33 349  55
##                B   4  15 145
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8308605

0.8308605 Test Gaussian SVM on region

# Held-out confusion matrix for this region.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O 15  8  6
##          W 19 84 36
##          B  4 19 34
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5911111

0.5911111 #### REGION 4 - WEST

reg.data <- fatal_data4[fatal_data4$region == 4,]

split up our data into train and test set

# Seeded 75/25 split within this region; race_cat is the response.
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# RBF SVM within this region; region excluded (constant inside the subset).
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~. -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
# Training-set confusion matrix.
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 215  35  17
##                W  38 210  14
##                B   2   0  33
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8120567

0.8120567 Test Gaussian SVM on region

# Held-out confusion matrix for this region.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O 53 26 14
##          W 27 46 10
##          B  7  5  1
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5291005

0.5291005 ### FEATURE SELECTION — set the seed to make the partition reproducible. Region is excluded for this part.

# 75/25 split of the full (all-region) dataset for feature selection.
smp_size <- floor(0.75 * nrow(fatal_data1)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data1)), size = smp_size)
train <- fatal_data1[train_ind, ]
test <- fatal_data1[-train_ind, ]
Y.train <- train$race_cat
# Features: every column except the response.
X.train <- train[,-which(names(train)=='race_cat')]
Y.test <- test$race_cat
X.test <- test[,-which(names(test)=='race_cat')]

# Drop the response plus two features (mental illness, threat level) from
# the training features before RFE.
# FIX: the original used typographic quotes (‘…’), which R cannot parse,
# and the -which(...) pattern, which selects ZERO columns when nothing
# matches (-integer(0) indexes nothing). Use a logical %in% mask instead.
X.train <- train[, !(names(train) %in% c('race_cat', 'signs_of_mental_illness', 'threat_level'))]

# Map a single race label to a numeric factor code: B -> 1, W -> 2, O -> 3.
# Returns a length-1 factor so that sapply()'s simplification (unlist on a
# list of factors) combines the results into one factor.
# FIX: an unexpected label previously fell through all branches and
# silently returned NULL; raise an explicit error instead.
blep <- function(x){
  if(x == 'B') return(as.factor(1))
  if(x == 'W') return(as.factor(2))
  if(x == 'O') return(as.factor(3))
  stop("blep: unexpected race label: ", x, call. = FALSE)
}
# Element-wise recode of the response. NOTE(review): sapply() simplifies
# the list of length-1 factors via unlist(), which appears to merge them
# into a single factor with unioned levels 1/2/3 — confirm this is the
# intended representation for rfe().
Y.train <- sapply(Y.train, blep)

define the control using a random forest selection function

# RFE control: random-forest ranking functions scored by 10-fold CV.
control <- rfeControl(functions=rfFuncs, method="cv", number=10)

run the RFE algorithm

# Recursive feature elimination over candidate subset sizes 1..8.
results <- rfe(X.train, Y.train, sizes=c(1:8), rfeControl=control)

summarize the results

print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.5230 0.2462    0.02906 0.05262         
##          2   0.5462 0.2605    0.04939 0.07722         
##          3   0.5708 0.2954    0.02759 0.04567         
##          4   0.5690 0.2946    0.04643 0.07543         
##          5   0.5713 0.2939    0.03917 0.06454         
##          6   0.5848 0.3130    0.02501 0.04204         
##          7   0.5897 0.3165    0.02767 0.04469        *
##          8   0.5891 0.3132    0.02924 0.04623         
##         10   0.5799 0.3053    0.03477 0.05571         
## 
## The top 5 variables (out of 7):
##    lon, age, lat, signs_of_mental_illness, weapon_cat

list the chosen features

predictors(results)
## [1] "lon"                     "age"                    
## [3] "lat"                     "signs_of_mental_illness"
## [5] "weapon_cat"              "threat_level"           
## [7] "gender"

plot the results

# Accuracy as a function of the number of retained features.
plot(results, type=c("g", "o"))

In light of the above, we use only lon, age, lat, mental illness, threat level, gender, and weapon_cat.

# NOTE(review): smp_size is carried over from the fatal_data1 split above;
# this assumes nrow(fatal_data5) == nrow(fatal_data1) — confirm.
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data5)), size = smp_size)
train <- fatal_data5[train_ind, ]
test <- fatal_data5[-train_ind, ]
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]

RFE TRAIN

# Gaussian SVM on the RFE-selected feature set.
cost = 230
gamma = 0.02
g.svm <- svm(race_cat ~., data = train,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 209  82  50
##                W 195 662 203
##                B  23  59 150
# Training accuracy.
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6252296

0.6252296 RFE TEST

g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred   O   W   B
##          O  57  28  16
##          W  58 222  83
##          B   6  26  49
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6018349

0.6018349 ### LAT - LONG INTERACTION TERM

# Same split as above, now adding a lat x lon interaction to the formula.
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data5)), size = smp_size)
train <- fatal_data5[train_ind, ]
test <- fatal_data5[-train_ind, ]
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]

RFE TRAIN

cost = 230
gamma = 0.02
# `~. + lat*lon` adds the lat:lon interaction on top of all main effects.
g.svm <- svm(race_cat ~. + lat*lon, data = train,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 208  75  47
##                W 194 664 198
##                B  25  64 158
# Training accuracy.
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.630741

0.630741 RFE TEST

# Held-out performance of the interaction-term model.
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred   O   W   B
##          O  60  26  17
##          W  55 223  76
##          B   6  27  55
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6201835

0.6201835 ### COMBINE RFE WITH REGION STRATIFICATION

# Drop the features RFE ranked as unhelpful, then refit per-region SVMs.
to_drop <- c("manner_of_death","flee","body_camera")
all.reg.data <- fatal_data4[,!(names(fatal_data4) %in% to_drop)]

REGION 1 - NORTHEAST

reg.data <- all.reg.data[all.reg.data$region == 1,]

split up our data into train and test set

smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# Region-specific hyperparameters.
cost = 250
gamma = 0.01
g.svm <- svm(race_cat ~ . -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train  O  W  B
##                O  5  1  0
##                W  8 50 10
##                B  8  7 34
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7235772

0.7235772 Test Gaussian SVM on region

g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O  0  1  0
##          W  3 15  6
##          B  4  2 10
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6097561

0.6097561

REGION 2 - MIDWEST

# Subset to region 2 (Midwest) of the RFE-reduced data.
reg.data <- all.reg.data[all.reg.data$region == 2,]

split up our data into train and test set

smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# Region-specific hyperparameters.
cost = 220
gamma = 0.1
g.svm <- svm(race_cat ~ . -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O  21   1   0
##                W   6 145   9
##                B   5  11  73
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8819188

0.8819188 Test Gaussian SVM on region

g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O  0  6  1
##          W  4 31 21
##          B  1  9 18
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5384615

0.5384615 #### REGION 3 - SOUTH

# Subset to region 3 (South) of the RFE-reduced data.
reg.data <- all.reg.data[all.reg.data$region == 3,]

split up our data into train and test set

smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# Region-specific hyperparameters.
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~ . -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O  42   3   3
##                W  54 349  77
##                B   7  15 124
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.764095

0.764095 Test Gaussian SVM on region

g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O 14  6  3
##          W 19 87 48
##          B  5 18 25
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.56

0.56 ### REGION 4 - WEST

# Subset to region 4 (West) of the RFE-reduced data.
reg.data <- all.reg.data[all.reg.data$region == 4,]

split up our data into train and test set

smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]

Train Gaussian SVM on region

# Region-specific hyperparameters.
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~ . -region, data = train.reg,
             type = 'C-classification', kernel = 'radial',
             cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
# Training confusion matrix (rows = predicted, columns = actual).
table(g.svm.pred.train, Y.train)
##                 Y.train
## g.svm.pred.train   O   W   B
##                O 208  57  34
##                W  46 187  14
##                B   1   1  16
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7287234

0.7287234 Test Gaussian SVM on region

g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
##           Y.test
## g.svm.pred  O  W  B
##          O 61 32 17
##          W 24 45  6
##          B  2  0  2
# Test accuracy.
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5714286

0.5714286

Ed

# Load the preprocessed shootings data (with geocoded locations) and fix
# the RNG seed for the resampling analyses below.
ps.data = readRDS("processedData/df.location.RDS")
set.seed(414)
### Functions

loss <- function(y, f.hat, c) {
  # Weighted misclassification loss for the body-camera problem.
  # A missed "True" (predicted "False") costs `c`; a spurious "True"
  # (predicted for an actual "False") costs 1.
  false.neg <- sum(y == "True" & f.hat == "False")
  false.pos <- sum(y == "False" & f.hat == "True")
  false.neg * c + false.pos
}
rf <- function(X.train, Y.train, X.test, Y.test, thres, c = 9) {
  # Fit a random forest and evaluate it on the test set, classifying as
  # "True" whenever P("True") exceeds `thres` (instead of the default
  # majority vote). Returns a list: confusion table, "False"-class error
  # rate, "True"-class error rate, and the weighted loss (see loss()).
  forest <- randomForest(X.train, Y.train)
  prob <- predict(forest, X.test, type = "prob")
  true.col <- which(colnames(prob) == "True")
  # Two-column character matrix: predicted label, actual label.
  results <- cbind(unname(ifelse(prob[, true.col] > thres, "True", "False")),
                   as.character(Y.test))

  conf <- table(results[, 1], results[, 2])
  # Per-class error rates (1 - recall for each class).
  false.err <- 1 - sum(results[, 1] == "False" & results[, 2] == "False") /
    sum(results[, 2] == "False")
  true.err <- 1 - sum(results[, 1] == "True" & results[, 2] == "True") /
    sum(results[, 2] == "True")

  cost <- loss(y = Y.test, f.hat = results[, 1], c)

  list(conf, false.err, true.err, cost)
}
folds <- function(n, k) {
  # Randomly partition indices 1..n into k cross-validation folds.
  # Each of the first k-1 folds holds round(n/k) indices; the last fold
  # absorbs whatever remains.
  perm <- sample(seq_len(n), n, replace = FALSE)
  fold.size <- round(n / k)
  out <- vector("list", k)
  for (i in seq_len(k)) {
    lo <- (i - 1) * fold.size + 1
    hi <- if (i < k) i * fold.size else n
    out[[i]] <- perm[lo:hi]
  }
  out
}
up <- function(X.train, Y.train) {
  # Balance classes by oversampling the minority "True" class (with
  # replacement) until it matches the "False" count; the "False" rows are
  # kept in full, in a shuffled order. Returns list(X, Y) with "True"
  # rows first.
  idx.true <- which(Y.train == "True")
  idx.false <- which(Y.train == "False")

  boot.true <- sample(idx.true, length(idx.false), replace = TRUE)
  perm.false <- sample(idx.false, length(idx.false), replace = FALSE)
  keep <- c(boot.true, perm.false)

  list(X.train[keep, ], Y.train[keep])
}
down <- function(X.train, Y.train) {
  # Balance classes by subsampling the majority "False" class (without
  # replacement) down to the "True" count; the "True" rows are all kept,
  # in a shuffled order. Returns list(X, Y) with "True" rows first.
  idx.true <- which(Y.train == "True")
  idx.false <- which(Y.train == "False")

  shuf.true <- sample(idx.true, length(idx.true), replace = FALSE)
  sub.false <- sample(idx.false, length(idx.true), replace = FALSE)
  keep <- c(shuf.true, sub.false)

  list(X.train[keep, ], Y.train[keep])
}
cv.rf = function(X, Y, thres, c = 9, k = 5, r = 1,
                 sample = c("none","up","down")){
  #Perform k-fold cross validation r times. Returns the loss
  #along with the error rate for the "true" class
  #
  # X, Y   : features and response (factor with levels "True"/"False")
  # thres  : probability threshold forwarded to rf()
  # c      : loss weight for a misclassified "True" case
  # k, r   : number of folds / number of CV repetitions
  # sample : resampling scheme applied to each training fold
  #
  # FIX: resolve `sample` to a single validated choice. Previously,
  # calling cv.rf() without `sample` left the full length-3 default, and
  # `if(sample == "up")` then failed on a non-scalar condition.
  sample <- match.arg(sample)
  loss = 0
  true.err = 0
  for(j in 1:r) {
    n = length(Y)
    sets = folds(n, k)
    for(i in 1:k) {
      # Fold i is the validation set; all other folds form the training set.
      X.train = X[unlist(sets[-i]),]
      Y.train = Y[unlist(sets[-i])]
      X.test = X[sets[[i]],]
      Y.test = Y[sets[[i]]]

      # Rebalance classes inside the training fold only, so the
      # validation fold keeps the true class proportions.
      if(sample == "up") {
        temp = up(X.train,Y.train)
        X.train = temp[[1]]
        Y.train = temp[[2]]
      }
      if(sample == "down") {
        temp = down(X.train,Y.train)
        X.train = temp[[1]]
        Y.train = temp[[2]]
      }

      results = rf(X.train, Y.train, X.test, Y.test, thres, c)
      loss = loss + results[[4]]
      true.err = true.err + results[[3]]
    }
  }
  # Average loss and "True"-class error over all k*r validation folds.
  return(list(loss/(k*r),(true.err)/(k*r)))
}
### Data Cleaning
# Response: whether the officer wore a body camera. Predictors: columns
# 4-14, 16, 17 of the processed data (positional — NOTE(review): confirm
# these positions against df.location.RDS).
Y = as.factor(ps.data$body_camera)
X = as.data.frame(ps.data[,c(4:14,16,17)])
# NOTE(review): rows 1935 and 2050 are removed by hard-coded position —
# presumably known bad records; verify against the raw data.
Y = Y[-c(1935,2050)]
X = X[-c(1935,2050),]

#omit missing data
# Filter Y with the same mask BEFORE subsetting X so the two stay aligned.
Y = Y[complete.cases(X)]
X = X[complete.cases(X),]

#subset "Armed"
# Collapse the free-text "armed" field: anything containing "gun" becomes
# "gun"; everything outside the six common categories becomes "other".
X$armed = as.character(X$armed)
gun = grep("gun", X$armed)
X$armed[gun] = "gun"
other = which(X$armed != "gun" &
                X$armed != "unarmed" &
                X$armed != "undetermined" &
                X$armed != "knife" &
                X$armed != "vehicle" &
                X$armed != "toy weapon")
X$armed[other] = "other"
X$armed = as.factor(X$armed)
#remove city/state variables
# NOTE(review): positional column drop — confirm columns 6-8 are still
# the city/state variables after the subsetting above.
X = X[-c(6:8)]
#create a training/test set
testset = function(X, Y, size = 500) {
  # Randomly split (X, Y) into a test set of `size` rows and a training
  # set with the remainder.
  # Returns list(X.train, Y.train, X.test, Y.test).
  #
  # FIX: the original sampled from 0:length(Y); drawing index 0 silently
  # shrank one of the partitions by a row (x[c(0, i)] drops the 0).
  # Sample valid indices only.
  draws = sample(seq_along(Y), length(Y), replace = FALSE)
  test = draws[1:size]
  train = draws[(size+1):length(Y)]

  Y.test = Y[test]
  X.test = X[test,]
  Y.train = Y[train]
  X.train = X[train,]
  return(list(X.train,Y.train,X.test,Y.test))
}

# Hold out 500 observations as the test set; unpack the four pieces.
test = testset(X, Y, 500)
X.train = test[[1]]
Y.train = test[[2]]
X.test = test[[3]]
Y.test = test[[4]]

Data Analysis

# Baseline random forest on the (class-imbalanced) training set.
rf.train = randomForest(X.train,Y.train)

construct the oversampled training set

# Indices of each class in the training response.
bcam.t = which(Y.train == "True")
bcam.f = which(Y.train == "False")

# Oversample "True" (with replacement) up to the "False" count.
up.t = sample(bcam.t,length(bcam.f),replace = TRUE)
up.f = sample(bcam.f,length(bcam.f),replace = FALSE)
Y.up = Y.train[c(up.t,up.f)]
X.up = X.train[c(up.t,up.f),]

fit random forest and calculate cv error

rf.up = randomForest(X.up,Y.up)
# 5-fold CV loss and "True"-class error, upsampling inside each fold.
cv.rf(X.train,Y.train,.5,sample = "up")
## [[1]]
## [1] 314.8
## 
## [[2]]
## [1] 0.9291268

same as above for undersampled training set

# Downsample "False" (without replacement) to the "True" count.
down.t = sample(bcam.t,length(bcam.t),replace = FALSE)
down.f = sample(bcam.f,length(bcam.t),replace = FALSE)
Y.down = Y.train[c(down.t,down.f)]
X.down = X.train[c(down.t,down.f),]
rf.down = randomForest(X.down,Y.down)
# CV with downsampling inside each fold.
cv.rf(X.train,Y.train,.5,sample = "down")
## [[1]]
## [1] 275
## 
## [[2]]
## [1] 0.3993785

For regular training set, search for optimal threshold

# Coarse grid over probability thresholds, scored by 5x-repeated CV loss.
thresholds = c(0.001, 0.01, 0.1, .25, .5)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "none")[[1]]))
}
## [1]   0.001 298.280
## [1]   0.01 287.76
## [1]   0.10 268.92
## [1]   0.25 308.48
## [1]   0.50 326.36
# Finer grid around the best coarse value (0.1).
thresholds = 0.1+0.01*(0:10)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "none")[[1]]))
}
## [1]   0.10 270.88
## [1]   0.11 278.80
## [1]   0.12 274.92
## [1]   0.13 274.52
## [1]   0.14 278.52
## [1]   0.15 284.68
## [1]   0.16 282.32
## [1]   0.17 288.64
## [1]   0.18 291.12
## [1]   0.19 290.68
## [1]   0.20 296.48
# Held-out test performance at the chosen threshold 0.10.
rf(X.train,Y.train,X.test,Y.test,.10)
## [[1]]
##        
##         False True
##   False   255   22
##   True    189   34
## 
## [[2]]
## [1] 0.4256757
## 
## [[3]]
## [1] 0.3928571
## 
## [[4]]
## [1] 387
# Same model scored on its own training data (optimistic baseline).
rf(X.train,Y.train,X.train,Y.train,.1)
## [[1]]
##        
##         False True
##   False  1388    0
##   True    107  182
## 
## [[2]]
## [1] 0.07157191
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 107

For oversampled training set, search for optimal threshold

# Coarse threshold grid with upsampling inside each CV fold.
thresholds = c(0.001, 0.01, 0.1, .25, .5)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9,sample = "up")[[1]]))
}
## [1]   0.001 298.800
## [1]   0.01 293.16
## [1]   0.10 273.48
## [1]   0.25 275.32
## [1]   0.5 316.6
# Finer grid around 0.1.
thresholds = 0.1+0.01*(0:10)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "up")[[1]]))
}
## [1]   0.10 271.28
## [1]   0.11 268.68
## [1]   0.12 266.40
## [1]   0.13 268.84
## [1]   0.14 270.08
## [1]   0.15 261.64
## [1]   0.16 267.60
## [1]   0.17 264.08
## [1]   0.18 270.16
## [1]   0.19 273.80
## [1]   0.20 274.88
# Held-out test performance of the oversampled fit at threshold 0.17.
rf(X.up,Y.up,X.test,Y.test,.17)
## [[1]]
##        
##         False True
##   False   257   23
##   True    187   33
## 
## [[2]]
## [1] 0.4211712
## 
## [[3]]
## [1] 0.4107143
## 
## [[4]]
## [1] 394
# Same fit scored on its own (oversampled) training data.
rf(X.up,Y.up,X.up,Y.up,.17)
## [[1]]
##        
##         False True
##   False  1315    0
##   True    180 1495
## 
## [[2]]
## [1] 0.1204013
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 180

For undersampled training set, search for optimal threshold

# Coarse threshold grid with downsampling inside each CV fold.
thresholds = c(0.001, 0.01, 0.1, .25, .5)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "down")[[1]]))
}
## [1]   0.001 299.000
## [1]   0.01 299.00
## [1]   0.10 298.08
## [1]   0.25 288.00
## [1]   0.5 276.2
# Finer grid around 0.4-0.5.
thresholds = 0.4+0.01*(0:10)

for(x in thresholds){
  print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "down")[[1]]))
}
## [1]   0.4 269.0
## [1]   0.41 265.40
## [1]   0.42 261.20
## [1]   0.43 271.68
## [1]   0.44 267.80
## [1]   0.45 265.76
## [1]   0.46 270.52
## [1]   0.47 268.52
## [1]   0.48 278.04
## [1]   0.49 266.96
## [1]   0.50 278.04
# NOTE(review): this evaluates the downsampled fit on its own training
# data (X.down/Y.down), not on the held-out test set — likely why every
# reported error is zero; confirm whether rf(X.down,Y.down,X.test,Y.test,.5)
# was intended.
rf(X.down,Y.down,X.down,Y.down,.5)
## [[1]]
##        
##         False True
##   False   182    0
##   True      0  182
## 
## [[2]]
## [1] 0
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 0